# Environment setup for the Thai TTS exploration notebook: numeric / audio /
# plotting imports plus project helper scripts pulled in via IPython %run.
import torch
torch.cuda.set_device("cuda:0")  # pin all CUDA work to the first GPU
import numpy as np
from scipy.io.wavfile import read, write
import matplotlib.pyplot as plt
import os, sys, json
from IPython.display import Audio  # inline audio playback widget
import warnings
warnings.filterwarnings("ignore")  # suppress all warning output for cleaner cells
# Each %run executes the script inside this namespace, defining the helpers
# used below (load_audio, second_to_sr, TTS_system, griffin_lim_transform,
# plot_spectrogram_to_numpy, frequency_spectrum, ...).
# NOTE(review): `librosa` is used later but never imported here — presumably
# one of these scripts imports it; confirm.
%run data_eda.py
%run TTS_module.py
%run test_waveglow_fn.py

# Sample 04: load episode 04-01 with its caption file, print captions 8-13,
# crop the waveform to the time span of captions 6-13, then play and plot it.
audio, sr, text = load_audio('data/uploads/04/04-01.mp3', 'data/04_01.txt')
# NOTE(review): restored the loop-body indentation lost in the notebook export.
for i in range(8, 14):
    print(text[i])
# text[i][0] is presumably the caption start time in seconds — TODO confirm.
audio = audio[second_to_sr(text[6][0]):second_to_sr(text[13][0])]
Audio(audio, rate=sr)  # playback widget (only rendered as a cell's last value)
plt.figure(figsize=(15, 4))
librosa.display.waveplot(audio, sr, alpha=0.8)
plot_spectrogram_to_numpy(get_mel_fromnumpy(audio, sr))
# Sample 13: same inspection for episode 13-01 — print captions 9-14 and
# crop the waveform to the span of captions 8-14.
audio, sr, text = load_audio('data/uploads/13/13-01.mp3', 'data/13_01.txt')
# NOTE(review): restored the loop-body indentation lost in the notebook export.
for i in range(9, 15):
    print(text[i])
audio = audio[second_to_sr(text[8][0]):second_to_sr(text[14][0])]
Audio(audio, rate=sr)  # playback widget (only rendered as a cell's last value)
plt.figure(figsize=(15, 5))
librosa.display.waveplot(audio, sr, alpha=0.8)
plot_spectrogram_to_numpy(get_mel_fromnumpy(audio, sr))
# Sample 07: print captions 9-14 of episode 07-02 and play the matching span
# (only the playback is cropped here; `audio` itself is left untouched).
audio, sr, text = load_audio('data/uploads/07/07-02.mp3', 'data/07_02.txt')
# NOTE(review): restored the loop-body indentation lost in the notebook export.
for i in range(9, 15):
    print(text[i])
Audio(audio[second_to_sr(text[8][0]):second_to_sr(text[14][0])], rate=sr)
# Sample 11: print captions 9-14 of episode 11-01 and play the matching span.
audio, sr, text = load_audio('data/uploads/11/11-01.mp3', 'data/11_01.txt')
# NOTE(review): restored the loop-body indentation lost in the notebook export.
for i in range(9, 15):
    print(text[i])
Audio(audio[second_to_sr(text[8][0]): second_to_sr(text[14][0])], rate=sr)
# Sample 01: print captions 9-14 of episode "เล่ม1 ตอน2" and play the segment
# up to caption 15's start time.
audio, sr, text = load_audio('data/uploads/01/เล่ม1 ตอน2 เจ้าป่าห้วยเสือโฮก.mp3', 'data/01_02.txt')
# NOTE(review): restored the loop-body indentation lost in the notebook export.
for i in range(9, 15):
    print(text[i])
# NOTE(review): the slice ends at text[15][0] although only captions up to 14
# are printed — looks intentional (end boundary of caption 14), confirm.
Audio(audio[second_to_sr(text[8][0]): second_to_sr(text[15][0])], rate=sr)
Text-to-speech system: a Tacotron2 model trained on a gTTS-generated dataset (initialized from a model pretrained on the LJSpeech data), paired with an additionally fine-tuned WaveGlow vocoder.
# First synthesis demo: run the full TTS pipeline on a sample Thai sentence,
# with visualization of the attention alignment and the mel-spectrogram.
SAMPLE_RATE = 22050  # output sampling rate in Hz
TEXT = 'สวัสดีคนไทยกินผัดไทย ไม่ใส่เครื่องใน'
mel, audio = TTS_system(TEXT, viz=True)
The upper figure shows the alignment weights between the encoder and decoder. The non-diagonal region at the end of the alignment suggests that the model learned the long drag sound, which is one of the characteristics of the Thai language.
The lower figure shows the spectrogram generated by the Tacotron2 model, inferred from the given text.
The audio sample below is inferred by the WaveGlow model, conditioned on the mel-spectrogram from the Tacotron2 model.
# Play the WaveGlow output and inspect it in the time and frequency domains.
Audio(audio, rate=SAMPLE_RATE)
plt.figure(figsize=(15, 4))
plt.plot(audio)  # raw synthesized waveform
frequency_spectrum(audio, SAMPLE_RATE)
Comparison with audio inverted by the traditional Griffin-Lim method (i.e. without using a trainable model).
# Baseline: invert the same mel-spectrogram with Griffin-Lim instead of the
# neural vocoder, then play and plot the reconstruction for comparison.
audio_grif = griffin_lim_transform(mel, n_iters=10000)
Audio(audio_grif, rate=SAMPLE_RATE)
plt.figure(figsize=(15, 4))
plt.plot(audio_grif)  # waveform of the Griffin-Lim reconstruction
# Further synthesis demos: run each remaining sample sentence through the
# pipeline (no visualization) and build a playback widget for each. As a
# script the bare Audio(...) expressions render nothing, same as the
# original flattened cells.
for TEXT in (
    'สวัสดีค่ะ วัลแคน ยินดีให้บริการค่ะ',
    'ข้อมูลชุดนี้ มาจากกูเกิ้ลค่ะ',
    'รอติดตามโมเดลจากข้อมูลเสียงจริง ต่อไปนะคะ',
    'วันจันทร์อากาศสดใส เหมาะกับการไปเดินทะเล',
    'วันนี้ของคุณ เป็นอย่างไรบ้างคะ',
):
    mel, audio = TTS_system(TEXT)
    Audio(audio, rate=22050)